0.1 plot fev1 profiles of discovery and validation dataset

demoPre <- rbind(demo1_dis, demo1_val)
demoPre$Set <- rep(c("Discovery cohort", "Validation cohort"), c(nrow(demo1_dis), nrow(demo1_val)))
  
demo.core.fev1 <- demoPre[, c("BLFEV", "F10L","F20L","F30L",
                                    "F45L","F60L", "F90L","F120L",
                                    "F180L","F240L","F300L","F360L","F420L")]
demo.core.fev1.scale <- 100*scale(t(demo.core.fev1), center=demo.core.fev1$BLFEV, scale=demo.core.fev1$BLFEV)

fev1Dat <- demo.core.fev1.scale %>% as.data.frame %>% 
  mutate(time = c(0, as.numeric(gsub("F|L", "", rownames(demo.core.fev1.scale)[-1])))/60) %>% 
  gather(Subj, dropFEV1, -time)
pdf(paste0(rnaelements_dir, "/results/Figure4a.pdf"), width = 15, height = 6.5)
fev1Dat %>% mutate(Set = factor(demoPre[fev1Dat$Subj, "Set"], c("Discovery cohort", "Validation cohort")), 
         Response = factor(demoPre[fev1Dat$Subj, "calculated_Response"], levels = c("ER", "DR")),
         subj = demoPre[fev1Dat$Subj, "concealed_uniqueID"]) %>% 
  ggplot(aes(x = time, y = dropFEV1, group = subj, color = Response)) +
  geom_segment(aes(x = 3, y = -15, xend = 7, yend = -15), col = "black", linetype = "dashed") +
  geom_point(size=3) +
  geom_line(size=1) + 
  stat_smooth(aes(group = Response)) + stat_summary(aes(group = Response), geom = "point", fun.y = mean, shape = 17, size = 3) +
  facet_wrap(~Set) +
  customTheme(sizeStripFont = 25, xAngle = 0, hjust = 0.5, vjust = 0.5, xSize = 20, ySize = 20, xAxisSize = 20, yAxisSize = 20) +
  geom_segment(aes(x = -0.08, y = -60, xend = -0.08, yend = -55), arrow = arrow(length = unit(0.3, "cm")), col = "black") +
  annotate("text", x=1.7, y=-58, label="Blood draw", cex = 7) +
  scale_y_continuous(expression('Percent drop in '~ FEV[1])) + xlab("Time (hours)") +
  theme(legend.text = element_text(size = 20)) +
  geom_hline(yintercept = 0, linetype = "dashed", col = "gray")
dev.off()
## png 
##   2

0.2 demographics

nrow(demo1_dis); nrow(demo1_val)
## [1] 29
## [1] 33
table(demo1_dis$calculated_Response); table(demo1_val$calculated_Response)
## 
## DR ER 
## 14 15
## 
## DR ER 
## 24  9
table(demo1_val$SEX, demo1_val$calculated_Response)
##    
##     DR ER
##   F 11  5
##   M 13  4
11/24 # female (dr)
## [1] 0.4583333
5/9 # female (er)
## [1] 0.5555556
table(as.character(demo1_val$Allergen_cleanLabel), demo1_val$calculated_Response)
##          
##           DR ER
##   Cat      9  7
##   Grass    4  2
##   HDM      8  0
##   Horse    1  0
##   Ragweed  2  0
table(demo1_val$SITE, demo1_val$calculated_Response)
##        
##         DR ER
##   LAVAL  9  6
##   MAC   12  1
##   UBC    3  2
## Weight
fit <- lm(demo1_val$Wt..Kg. ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.9741, p-value = 0.6561
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) # p=0.51
## 
## Call:
## lm(formula = demo1_val$Wt..Kg. ~ demo1_val$calculated_Response)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.709  -8.745  -1.209   9.416  20.405 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       72.209      2.480  29.113   <2e-16 ***
## demo1_val$calculated_ResponseER   -3.214      4.803  -0.669    0.509    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.63 on 28 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.01574,    Adjusted R-squared:  -0.01941 
## F-statistic: 0.4478 on 1 and 28 DF,  p-value: 0.5089
## height
fit <- lm(demo1_val$HT.cm. ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.96828, p-value = 0.4534
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) #pt = 0.43
## 
## Call:
## lm(formula = demo1_val$HT.cm. ~ demo1_val$calculated_Response)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.154  -6.393  -2.132   8.129  16.846 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      171.154      1.883  90.872   <2e-16 ***
## demo1_val$calculated_ResponseER   -3.044      3.767  -0.808    0.425    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.227 on 30 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.02131,    Adjusted R-squared:  -0.01132 
## F-statistic: 0.6531 on 1 and 30 DF,  p-value: 0.4254
## Age 
fit <- lm(demo1_val$AGE ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## not normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.87783, p-value = 0.001487
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demo1_val$AGE[demo1_val$calculated_Response == "ER"], 
            y = demo1_val$AGE[demo1_val$calculated_Response == "DR"])  # pw=0.35
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  demo1_val$AGE[demo1_val$calculated_Response == "ER"] and demo1_val$AGE[demo1_val$calculated_Response == "DR"]
## W = 131.5, p-value = 0.3518
## alternative hypothesis: true location shift is not equal to 0
## BLFEV 
fit <- lm(demo1_val$BLFEV ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.97164, p-value = 0.5268
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) #p = 0.14
## 
## Call:
## lm(formula = demo1_val$BLFEV ~ demo1_val$calculated_Response)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4350 -0.4322 -0.1750  0.3678  1.6150 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       3.4950     0.1518  23.023   <2e-16 ***
## demo1_val$calculated_ResponseER  -0.4428     0.2907  -1.523    0.138    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7437 on 31 degrees of freedom
## Multiple R-squared:  0.06964,    Adjusted R-squared:  0.03962 
## F-statistic:  2.32 on 1 and 31 DF,  p-value: 0.1378
## PRFEV
fit <- lm(demo1_val$PRFEV ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.9316, p-value = 0.06768
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) #p = 0.98
## 
## Call:
## lm(formula = demo1_val$PRFEV ~ demo1_val$calculated_Response)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.298 -0.478 -0.188  0.682  1.191 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.67800    0.15441  23.820   <2e-16 ***
## demo1_val$calculated_ResponseER -0.00925    0.28887  -0.032    0.975    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6905 on 26 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  3.944e-05,  Adjusted R-squared:  -0.03842 
## F-statistic: 0.001025 on 1 and 26 DF,  p-value: 0.9747
## EAR
fit <- lm(demo1_val$EAR ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.96757, p-value = 0.4162
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) #p = 0.21
## 
## Call:
## lm(formula = demo1_val$EAR ~ demo1_val$calculated_Response)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.312  -4.812   1.387   6.688  15.188 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -36.388      1.829 -19.891   <2e-16 ***
## demo1_val$calculated_ResponseER    4.465      3.503   1.275    0.212    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.962 on 31 degrees of freedom
## Multiple R-squared:  0.04981,    Adjusted R-squared:  0.01916 
## F-statistic: 1.625 on 1 and 31 DF,  p-value: 0.2119
## LAR
fit <- lm(demo1_val$LAR ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## not normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.92725, p-value = 0.02924
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demo1_val$LAR[demo1_val$calculated_Response == "ER"], 
            y = demo1_val$LAR[demo1_val$calculated_Response == "DR"])  # p=1.388e-05
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  demo1_val$LAR[demo1_val$calculated_Response == "ER"] and demo1_val$LAR[demo1_val$calculated_Response == "DR"]
## W = 216, p-value = 1.388e-05
## alternative hypothesis: true location shift is not equal to 0
## PC20
fit <- lm(demo1_val$PC20 ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## not normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.89127, p-value = 0.003187
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demo1_val$PC20[demo1_val$calculated_Response == "ER"], 
            y = demo1_val$PC20[demo1_val$calculated_Response == "DR"])  # pw=0.49
## 
##  Wilcoxon rank sum test
## 
## data:  demo1_val$PC20[demo1_val$calculated_Response == "ER"] and demo1_val$PC20[demo1_val$calculated_Response == "DR"]
## W = 126, p-value = 0.4862
## alternative hypothesis: true location shift is not equal to 0
## AIS
fit <- lm(demo1_val$AIS ~ demo1_val$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.92873, p-value = 0.05099
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) # Pt = 0.04
## 
## Call:
## lm(formula = demo1_val$AIS ~ demo1_val$calculated_Response)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4558 -0.9803 -0.5469  1.0149  4.9393 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       2.9178     0.3342   8.730  2.4e-09 ***
## demo1_val$calculated_ResponseER  -1.5656     0.7348  -2.131   0.0424 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.603 on 27 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.1439, Adjusted R-squared:  0.1122 
## F-statistic:  4.54 on 1 and 27 DF,  p-value: 0.04238
# variables that are NOT normally distributed
demo1_val %>% dplyr::select(AGE, LAR, PC20, calculated_Response) %>% 
  gather(DEMO, Value, -calculated_Response) %>% 
  group_by(DEMO, calculated_Response) %>% 
  dplyr::summarise(Median = round(median(Value, na.rm = TRUE), 2), 
    Q1 = quantile(Value, 0.25, na.rm = TRUE), Q3 = quantile(Value, 0.75, na.rm = TRUE))
## # A tibble: 6 x 5
## # Groups:   DEMO [?]
##   DEMO  calculated_Response Median     Q1     Q3
##   <chr> <chr>                <dbl>  <dbl>  <dbl>
## 1 AGE   DR                   25.5   21.8   41   
## 2 AGE   ER                   31     27     36   
## 3 LAR   DR                  -23.8  -30.9  -17.5 
## 4 LAR   ER                   -7.2  -11.1   -4.9 
## 5 PC20  DR                    1.58   0.69   4.08
## 6 PC20  ER                    5.13   0.64   5.45
# variables that are normally distributed
demo1_val %>% dplyr::select(Wt..Kg., HT.cm., BLFEV, PRFEV, EAR, AIS, calculated_Response) %>% 
  gather(DEMO, Value, -calculated_Response) %>% 
  group_by(DEMO, calculated_Response) %>% 
  summarise(Mean = round(mean(Value, na.rm = TRUE), 2), SD = round(sd(Value, na.rm = TRUE), 2))
## # A tibble: 12 x 4
## # Groups:   DEMO [?]
##    DEMO    calculated_Response    Mean    SD
##    <chr>   <chr>                 <dbl> <dbl>
##  1 AIS     DR                     2.92  1.69
##  2 AIS     ER                     1.35  1.13
##  3 BLFEV   DR                     3.5   0.81
##  4 BLFEV   ER                     3.05  0.51
##  5 EAR     DR                   -36.4   8.83
##  6 EAR     ER                   -31.9   9.32
##  7 HT.cm.  DR                   171.    8.91
##  8 HT.cm.  ER                   168.   10.2 
##  9 PRFEV   DR                     3.68  0.62
## 10 PRFEV   ER                     3.67  0.84
## 11 Wt..Kg. DR                    72.2  11.6 
## 12 Wt..Kg. ER                    69    11.9
## post PC20
demo <- readRDS(meta_file_rnapc_dir)
demoPost <- subset(demo, Time == "Post")

demoPostval <- demoPost[paste(as.character(demoPost$NAME), as.character(demoPost$AIC_YMD), sep = "_") %in% paste(as.character(demo1_val$NAME), as.character(demo1_val$AIC_YMD), sep = "_"), ]
table(demoPostval$calculated_Response)
## 
## DR ER 
## 24  9
fit <- lm(demoPostval$PC20 ~ demoPostval$calculated_Response)
plot(fit)

shapiro.test(fit$residuals) ## NOT normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.73553, p-value = 7.1e-06
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = demoPostval$PC20[demoPostval$calculated_Response == "ER"], 
            y = demoPostval$PC20[demoPostval$calculated_Response == "DR"])  # p=0.04
## 
##  Wilcoxon rank sum test
## 
## data:  demoPostval$PC20[demoPostval$calculated_Response == "ER"] and demoPostval$PC20[demoPostval$calculated_Response == "DR"]
## W = 108, p-value = 0.03567
## alternative hypothesis: true location shift is not equal to 0
demoPostval %>% dplyr::select(PC20, calculated_Response) %>% 
  gather(DEMO, Value, -calculated_Response) %>% 
  group_by(DEMO, calculated_Response) %>% 
  dplyr::summarise(Median = round(median(Value, na.rm = TRUE), 2), 
    Q1 = quantile(Value, 0.25, na.rm = TRUE), Q3 = quantile(Value, 0.75, na.rm = TRUE))
## # A tibble: 2 x 5
## # Groups:   DEMO [?]
##   DEMO  calculated_Response Median    Q1    Q3
##   <chr> <chr>                <dbl> <dbl> <dbl>
## 1 PC20  DR                    0.76  0.34  1.55
## 2 PC20  ER                    5.11  2.17  7.98

0.3 create cbc dataset and phenotype vectors

y.train_cbc  <- factor(demo1_dis$calculated_Response, c("ER", "DR"))
names(y.train_cbc ) <- rownames(demo1_dis)
y.test_cbc  <- factor(demo1_val$calculated_Response, c("ER", "DR"))
names(y.test_cbc ) <- rownames(demo1_val)
table(y.train_cbc ); table(y.test_cbc )
## y.train_cbc
## ER DR 
## 15 14
## y.test_cbc
## ER DR 
##  9 24
cbc_train <- na.omit(demo1_dis[, c("PC20", "Leukocyte_Counts.x10.9.", "Neu_percent","lym_percent","mono_percent","eos_percent","baso_percent")])
y.train_cbc <- y.train_cbc[rownames(cbc_train)]
cbc_test <- na.omit(demo1_val[, c("PC20", "Leukocyte_Counts.x10.9.", "Neu_percent","lym_percent","mono_percent","eos_percent","baso_percent")])
y.test_cbc <- y.test_cbc[rownames(cbc_test)]

table(y.test_cbc)
## y.test_cbc
## ER DR 
##  9 20
## statistical tests for cell-types
## which cell frequencies signficantly changes between er and dr at pre-challenge
## Leukocytes
fit <- lm(cbc_test$Leukocyte_Counts.x10.9. ~ y.test_cbc)
plot(fit)

shapiro.test(fit$residuals) # normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.97067, p-value = 0.578
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) # P=0.10
## 
## Call:
## lm(formula = cbc_test$Leukocyte_Counts.x10.9. ~ y.test_cbc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9230 -0.7433 -0.2230  0.6770  3.0770 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.1333     0.4817  10.657 3.56e-11 ***
## y.test_cbcDR   0.9897     0.5800   1.706   0.0994 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.445 on 27 degrees of freedom
## Multiple R-squared:  0.09734,    Adjusted R-squared:  0.06391 
## F-statistic: 2.912 on 1 and 27 DF,  p-value: 0.09943
## Neutrophils
fit <- lm(cbc_test$Neu_percent ~ y.test_cbc)
plot(fit)

shapiro.test(fit$residuals) # normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.97585, p-value = 0.725
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) # P=0.04
## 
## Call:
## lm(formula = cbc_test$Neu_percent ~ y.test_cbc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.15300 -0.06359  0.00000  0.07041  0.17900 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.58200    0.02841  20.487   <2e-16 ***
## y.test_cbcDR -0.07241    0.03421  -2.117   0.0436 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08522 on 27 degrees of freedom
## Multiple R-squared:  0.1423, Adjusted R-squared:  0.1106 
## F-statistic: 4.481 on 1 and 27 DF,  p-value: 0.04363
## Lymphocytes
fit <- lm(cbc_test$lym_percent ~ y.test_cbc)
plot(fit)

shapiro.test(fit$residuals) # normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.94436, p-value = 0.1304
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) # Pt=0.02
## 
## Call:
## lm(formula = cbc_test$lym_percent ~ y.test_cbc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12822 -0.06171  0.01229  0.05629  0.11278 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.29222    0.02442  11.967 2.64e-12 ***
## y.test_cbcDR  0.07549    0.02940   2.567   0.0161 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07326 on 27 degrees of freedom
## Multiple R-squared:  0.1962, Adjusted R-squared:  0.1664 
## F-statistic:  6.59 on 1 and 27 DF,  p-value: 0.01611
## Monocytes
fit <- lm(cbc_test$mono_percent ~ y.test_cbc)
plot(fit)

shapiro.test(fit$residuals) # normal
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.96399, p-value = 0.4103
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

summary(fit) # Pt=0.62
## 
## Call:
## lm(formula = cbc_test$mono_percent ~ y.test_cbc)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.033112 -0.007312  0.000888  0.011888  0.025022 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.074978   0.005136  14.598 2.48e-14 ***
## y.test_cbcDR 0.003134   0.006185   0.507    0.616    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01541 on 27 degrees of freedom
## Multiple R-squared:  0.009421,   Adjusted R-squared:  -0.02727 
## F-statistic: 0.2568 on 1 and 27 DF,  p-value: 0.6164
## Eosinophils
fit <- lm(cbc_test$eos_percent ~ y.test_cbc)
plot(fit)

shapiro.test(fit$residuals)  # NOT normal!
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.82529, p-value = 0.0002458
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = cbc_test$eos_percent[y.test_cbc == "ER"], 
            y = cbc_test$eos_percent[y.test_cbc == "DR"])  # pw=0.70
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cbc_test$eos_percent[y.test_cbc == "ER"] and cbc_test$eos_percent[y.test_cbc == "DR"]
## W = 98.5, p-value = 0.7041
## alternative hypothesis: true location shift is not equal to 0
## Basophils
fit <- lm(cbc_test$baso_percent ~ y.test_cbc)
plot(fit)

shapiro.test(fit$residuals)  # NOT normal!
## 
##  Shapiro-Wilk normality test
## 
## data:  fit$residuals
## W = 0.67347, p-value = 8.952e-07
qqnorm(fit$residuals); qqline(fit$residuals, col = 2)

wilcox.test(x = cbc_test$baso_percent[y.test_cbc == "ER"], 
            y = cbc_test$baso_percent[y.test_cbc == "DR"])  # pw=0.17
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cbc_test$baso_percent[y.test_cbc == "ER"] and cbc_test$baso_percent[y.test_cbc == "DR"]
## W = 60.5, p-value = 0.1683
## alternative hypothesis: true location shift is not equal to 0
# variables that are normally distributed
cbc_test %>% dplyr::select(Leukocyte_Counts.x10.9., Neu_percent, lym_percent, mono_percent) %>% 
  mutate(calculated_Response = as.character(y.test_cbc)) %>% 
  gather(Cells, Value, -calculated_Response) %>% 
  group_by(Cells, calculated_Response) %>% 
  summarise(Mean = round(mean(Value, na.rm = TRUE), 2), SD = round(sd(Value, na.rm = TRUE), 2))
## # A tibble: 8 x 4
## # Groups:   Cells [?]
##   Cells                   calculated_Response  Mean    SD
##   <chr>                   <chr>               <dbl> <dbl>
## 1 Leukocyte_Counts.x10.9. DR                  6.12   1.52
## 2 Leukocyte_Counts.x10.9. ER                  5.13   1.25
## 3 lym_percent             DR                  0.37   0.07
## 4 lym_percent             ER                  0.290  0.08
## 5 mono_percent            DR                  0.08   0.01
## 6 mono_percent            ER                  0.07   0.02
## 7 Neu_percent             DR                  0.51   0.08
## 8 Neu_percent             ER                  0.580  0.1
# variables that are NOT normally distributed
cbc_test %>% dplyr::select(eos_percent, baso_percent) %>% 
  mutate(calculated_Response = as.character(y.test_cbc)) %>% 
  gather(Cells, Value, -calculated_Response) %>% 
  group_by(Cells, calculated_Response) %>% 
  dplyr::summarise(Median = round(median(Value, na.rm = TRUE), 2), 
    Q1 = quantile(Value, 0.25, na.rm = TRUE), Q3 = quantile(Value, 0.75, na.rm = TRUE))
## # A tibble: 4 x 5
## # Groups:   Cells [?]
##   Cells        calculated_Response Median     Q1     Q3
##   <chr>        <chr>                <dbl>  <dbl>  <dbl>
## 1 baso_percent DR                    0.01 0.005  0.01  
## 2 baso_percent ER                    0    0.0021 0.006 
## 3 eos_percent  DR                    0.04 0.02   0.0449
## 4 eos_percent  ER                    0.03 0.024  0.059

0.4 Biomarker Panel validation

0.5 compute lambda values for elastic net classifiers

set.seed(32)
pos_hk_elasticNet_lambda <- mapply(function(X.train, X.test, y.train, y.test){
  fit <- enet(X = X.train, Y = y.train, alpha = 0, lambda = NULL, 
            family = "binomial", X.test = X.test, Y.test = y.test, 
            filter = "none", topranked = 10, keepVar = NULL, pop.prev = 0.6, cutoff = NULL)
  fit$lambda
}, X.train = pos_hk_demo1_disList, X.test = pos_hk_demo1_valList, y.train = y.train, y.test = y.test)
pos_hk_elasticNet_lambda
##            ucscGenes          ucscGeneIso              ensembl 
##           0.06176546           0.03495682           0.02673679 
##              trinity                  all             clinical 
##           0.04831687           0.06470647           0.12851145 
##   ucscGenes_clinical ucscGeneIso_clinical     ensembl_clinical 
##           0.06175233           0.17512428           0.06175233 
##     trinity_clinical         all_clinical 
##           0.12958963           0.24929520

0.6 Trinity panel

0.7 Enet

set.seed(32)
pos_hk_elasticNet <- lapply(1 : length(pos_hk_demo1_disList), function(i){
    fit <- enet(X = pos_hk_demo1_disList[[i]], Y = y.train[[i]], alpha = 0, lambda = pos_hk_elasticNet_lambda[i], 
            family = "binomial", X.test = pos_hk_demo1_valList[[i]], Y.test = y.test[[i]], 
            filter = "none", topranked = 10, keepVar = NULL, pop.prev = 0.6, cutoff = NULL)
  auc <- roc(y.test[[i]], as.numeric(fit$probs), ci = TRUE)
  data <- simple_roc(labels = as.numeric(y.test[[i]])-1, scores = fit$probs)
  data$Panel <- names(pos_hk_demo1_disList)[i]
  data$Classifier <- "Elastic net"
  data$AUC <- paste0("AUROC = ", round(as.numeric(auc$auc), 2))
  data$CI <- paste0("95% CI:", round(as.numeric(auc$ci), 2)[1], "-", round(as.numeric(auc$ci), 2)[3])
  data
})
names(pos_hk_elasticNet) <- names(pos_hk_demo1_disList)

0.7.1 Rf

set.seed(3)
pos_hk_rf <- lapply(1 : length(pos_hk_demo1_disList), function(i){
  fit <- rforest(X = pos_hk_demo1_disList[[i]], Y = y.train[[i]],
            family = "binomial", X.test = pos_hk_demo1_valList[[i]], Y.test = y.test[[i]], 
            filter = "none", topranked = 10)
  auc <- roc(y.test[[i]], as.numeric(fit$probs), ci = TRUE)
  data <- simple_roc(labels = as.numeric(y.test[[i]])-1, scores = fit$probs)
  data$Panel <- names(pos_hk_demo1_disList)[i]
  data$Classifier <- "Random forest"
  data$AUC <- paste0("AUROC = ", round(as.numeric(auc$auc), 2))
  data$CI <- paste0("95% CI:", round(as.numeric(auc$ci), 2)[1], "-", round(as.numeric(auc$ci), 2)[3])
  data
})
names(pos_hk_rf) <- names(pos_hk_demo1_disList)

0.8 Plot AUROC

aucDat <- rbind(do.call(rbind, pos_hk_elasticNet), do.call(rbind, pos_hk_rf))
aucDat$Panel[aucDat$Panel == "ucscGenes"] <- "UCSC genes"
aucDat$Panel[aucDat$Panel == "ucscGeneIso"] <- "UCSC gene-isoforms"
aucDat$Panel[aucDat$Panel == "ensembl"] <- "Ensembl"
aucDat$Panel[aucDat$Panel == "trinity"] <- "Trinity"
aucDat$Panel[aucDat$Panel == "all"] <- "Combined"
aucDat$Panel[aucDat$Panel == "clinical"] <- "Clinical"
aucDat$Panel[aucDat$Panel == "ucscGenes_clinical"] <- "UCSC genes+clinical"
aucDat$Panel[aucDat$Panel == "ucscGeneIso_clinical"] <- "UCSC gene-isoforms+clinical"
aucDat$Panel[aucDat$Panel == "ensembl_clinical"] <- "Ensembl+clinical"
aucDat$Panel[aucDat$Panel == "trinity_clinical"] <- "Trinity+clinical"
aucDat$Panel[aucDat$Panel == "all_clinical"] <- "Combined+clinical"
aucDat$Panel <- factor(aucDat$Panel, 
  levels = c("Clinical", "UCSC genes", "UCSC gene-isoforms", "Ensembl", "Trinity", "Combined",
    "UCSC genes+clinical", "UCSC gene-isoforms+clinical", "Ensembl+clinical", "Trinity+clinical", "Combined+clinical"))
aucDat$PanelType <- unlist(lapply(strsplit(as.character(aucDat$Panel), "\\+"), function(i) i[1]))
aucDat$Cohort <- "nanoString - Validation"
aucDat$ClassifierType <- rep("Molecular", nrow(aucDat))
aucDat$ClassifierType[grep("+clinical", aucDat$Panel)] <- "Molecular+Clinical"
aucDat$ClassifierType[grep("Clinical", aucDat$Panel)] <- "Clinical"
aucDat$PanelType <- factor(aucDat$PanelType, levels = c("Clinical", "UCSC genes", "UCSC gene-isoforms", "Ensembl", "Trinity", "Combined"))

## AUC records
aucLabel <- aucDat %>% dplyr::select(Panel, Classifier, PanelType, ClassifierType, AUC, CI, Cohort) %>% 
            group_by(Panel, Classifier, Cohort) %>% slice(1)
ci <- as.data.frame(do.call(rbind, strsplit(gsub("95% CI:", "", aucLabel$CI), "-")))
aucLabel$min <- as.numeric(as.character(ci$V1))
aucLabel$max <- as.numeric(as.character(ci$V2))
aucLabel$auc <- as.numeric(unlist(lapply(strsplit(aucLabel$AUC, "= "), function(i) i[2])))
aucLabel$ClassifierType <- rep("Molecular", nrow(aucLabel))
aucLabel$ClassifierType[grep("+clinical", aucLabel$Panel)] <- "Molecular+Clinical"
aucLabel$ClassifierType[grep("Clinical", aucLabel$Panel)] <- "Clinical"
aucLabel$x <- 0.75
aucLabel$y <- rep(0.1, nrow(aucLabel))
aucLabel$y[aucLabel$ClassifierType == "Molecular+Clinical"] <- 0.25
aucLabel$y.ci <- rep(0.05, nrow(aucLabel))
aucLabel$y.ci[aucLabel$ClassifierType == "Molecular+Clinical"] <- 0.2
colPalette <- c("dodgerblue", "black")
pdf(paste0(rnaelements_dir, "/results/Figures/Figure4/Figure4b_revised.pdf"), width = 15, height = 6.5)
ggplot(aucDat, aes(x = FPR, y = TPR, color = ClassifierType)) +
  geom_rect(data = subset(aucDat, PanelType == "Trinity"), 
           aes(fill = PanelType),xmin = -Inf,xmax = Inf,
            ymin = -Inf,ymax = Inf, alpha = 0.01, color = "#FC8D62") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", col = "black") + 
  geom_point() + geom_line() +
  facet_grid(Classifier ~ PanelType) +
  customTheme(sizeStripFont = 12, xAngle = 0, hjust = 0.5, vjust = 0.5, 
              xSize = 10, ySize = 10, xAxisSize = 10, yAxisSize = 10) +
  #scale_color_manual(values=colPalette, name = "Panel") +
  geom_text(data = aucLabel, aes(x = x, y = y, label = AUC), size = 3) +
  geom_text(data = aucLabel, aes(x = x, y = y.ci, label = CI), size = 3) +
  xlab("False Positive Rate") + ylab("True Positive Rate")
dev.off()
## png 
##   2

0.9 Trinity biomarker fold-changes

X <- rbind(pos_hk_demo1_disList$trinity, pos_hk_demo1_valList$trinity)
Y <- c(demo1_dis$calculated_Response, demo1_val$calculated_Response)
colnames(X)[colnames(X) == "unknown1_comp54405_c1_seq1"] <- "TNFRSF10C_intron"
colnames(X)[colnames(X) == "unknown2_comp55647_c0_seq2"] <- "2_LOC101927568_intron"
colnames(X)[colnames(X) == "unknown2a_comp55647_c0_seq2"] <- "2a_LOC105378945_intron"
colnames(X)[colnames(X) == "unknown3_comp56590_c0_seq8"] <- "TNFRSF10C_intron.LOC254896_exon"

ggplotColours <- function(n = 6, h = c(0, 360) + 15){
  if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
  hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}
ann_colors <- list(Response = ggplotColours(n=2))
## Figure 4b and c
result <- mixOmics::pca(X, scale = TRUE, center = TRUE)
pdf(rnae_5a_dir, width = 5, height = 4)
plotIndiv(result, ind.names = FALSE, ellipse = TRUE, star = TRUE, ellipse.level = 0.68, 
         group = Y, col.per.group = ann_colors$Response, legend = TRUE, pch = 19, 
  title = "Trinity panel: Individuals")
dev.off()
## png 
##   2
pdf(rnae_5b_dir, width = 6, height = 4)
plotLoadings(result, title = "Trinity panel: Variable Contribution", size.name = 0.6, size.title = 1)
dev.off()
## png 
##   2
# Figure 5c
scaledX <- scale(X)
scaledX[which(scaledX < -2)] <- -2
scaledX[which(scaledX > 2)] <- 2
NMF::aheatmap(t(scaledX), annCol = list(Response = Y),
    border_color = "black", annColors = ann_colors, scale = 'row', 
  filename = rnae_5c_dir, width = 12, height = 6)

0.10 apply Trinity panel to all subjects of the discovery and validation cohort

colMeans(pos_hk_demo1_disList$trinity[y.train$trinity == "DR", ])-colMeans(pos_hk_demo1_disList$trinity[y.train$trinity == "ER", ])
##                          CASP8                          CECR1 
##                    -0.24248974                     0.03176156 
##                          FNIP1                           FPR2 
##                    -0.33047723                    -0.17945569 
##                           LYST                            QKI 
##                    -0.10204956                    -0.15426356 
##                           SETX                          SF3B1 
##                    -0.29872154                    -0.34005866 
##     unknown1_comp54405_c1_seq1     unknown2_comp55647_c0_seq2 
##                    -0.43329914                    -0.09372008 
##    unknown2a_comp55647_c0_seq2     unknown3_comp56590_c0_seq8 
##                    -0.21498836                    -0.14155401 
##  FPR1_intron_comp17070_c0_seq1 IFRD1_intron_comp41141_c0_seq1 
##                    -0.19950686                    -0.18606152
set.seed(121)
## Elastic net
fit <- enet(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity, alpha = pos_hk_elasticNet_lambda["trinity"], lambda = NULL, 
            family = "binomial", X.test = pos_hk_demo1_valList$trinity, Y.test = y.test$trinity, 
            filter = "none", topranked = 10, keepVar = NULL, pop.prev = 0.6, cutoff = NULL)

## using a cut-off of 0.5
pred <- rep(NA, length(fit$probs))
pred[fit$probs >= 0.5] <- "DR"
pred[fit$probs < 0.5] <- "ER"
table(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity)
##     truth
## pred ER DR
##   ER  4  5
##   DR  5 19
calcPerf(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity, prev = 0.6)
##      sens      spec       npv       ppv  accuracy 
## 0.7916667 0.4444444 0.5871560 0.7276596 0.6969697
# tune cut-off
#fit$perfTest

## random forest
fit <-  rforest(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity,
            family = "binomial", pos_hk_demo1_valList$trinity, Y.test = y.test$trinity, 
            filter = "none", topranked = 10)
## using a cut-off of 0.5
pred <- rep(NA, length(fit$probs))
pred[fit$probs >= 0.5] <- "DR"
pred[fit$probs < 0.5] <- "ER"
table(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity)
##     truth
## pred ER DR
##   ER  2  3
##   DR  7 21
calcPerf(pred=factor(pred, levels(y.test$trinity)), truth=y.test$trinity, prev = 0.6)
##      sens      spec       npv       ppv  accuracy 
## 0.8750000 0.2222222 0.5423729 0.8552036 0.6969697
# tune cut-off
#fit$perfTest

0.11 apply Trinity panel

0.11.1 different response

## differentiall expression of flippers
all(rownames(demo1_val_repeatDifferentResponse) == rownames(pos_hk_demo1_val_repeatDifferentResponseList$trinity))
## [1] TRUE
group <- factor(demo1_val_repeatDifferentResponse$calculated_Response, c("ER", "DR"))
subject <- as.character(demo1_val_repeatDifferentResponse$NAME)
table(as.character(demo1_val_repeatDifferentResponse$NAME), demo1_val_repeatDifferentResponse$calculated_Response)
##         
##          DR ER
##   DMB     3  1
##   H-I     1  1
##   I-H     1  1
##   JPG     1  1
##   LRF     2  1
##   STS008  1  1
##   ZPG     1  1
unique(subject)
## [1] "DMB"    "H-I"    "I-H"    "JPG"    "LRF"    "ZPG"    "STS008"
comparisons <- lapply(1 : ncol(pos_hk_demo1_val_repeatDifferentResponseList$trinity), function(i){
  gene <- as.vector(as.matrix(pos_hk_demo1_val_repeatDifferentResponseList$trinity[, i]))
  df <- data.frame(gene, group, subject)
  groupedDat <- groupedData(gene ~ group | subject, data = df)
  fit <- lme(gene ~ group, data = groupedDat, random = ~ 1 | subject, na.action = na.omit)
  coef(summary(fit))[2,]
})

# Elastic net
fit_enet <- enet(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity, alpha = 0, 
                 lambda = pos_hk_elasticNet_lambda["trinity"], 
            family = "binomial", X.test = pos_hk_demo1_val_repeatDifferentResponseList$trinity, Y.test = factor(demo1_val_repeatDifferentResponse$calculated_Response, levels(y.train$trinity)), 
            filter = "none", topranked = 10, keepVar = NULL, pop.prev = 0.6, cutoff = NULL)

Y.test = factor(demo1_val_repeatDifferentResponse$calculated_Response, levels(y.train$trinity))
pred_enet <- rep(NA, length(fit_enet$probs))
pred_enet[fit_enet$probs >= 0.5] <- "DR"
pred_enet[fit_enet$probs < 0.5] <- "ER"
enetPerf_flippers <- calcPerf(pred=factor(pred_enet, levels(Y.test)), truth=Y.test, prev = 0.6)

# random forest
fit_rf <- rforest(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity,
            family = "binomial", X.test = pos_hk_demo1_val_repeatDifferentResponseList$trinity, Y.test = factor(demo1_val_repeatDifferentResponse$calculated_Response, levels(y.train$trinity)), 
            filter = "none", topranked = 10)

Y.test = factor(demo1_val_repeatDifferentResponse$calculated_Response, levels(y.train$trinity))
pred_rf <- rep(NA, length(fit_rf$probs))
pred_rf[fit_rf$probs >= 0.5] <- "DR"
pred_rf[fit_rf$probs < 0.5] <- "ER"
rfPerf_flippers <- calcPerf(pred=factor(pred_rf, levels(Y.test)), truth=Y.test, prev = 0.6)

perf_flippers <- round(rbind(enetPerf_flippers, rfPerf_flippers), 2) %>% as.data.frame %>% 
  mutate(Classifier = c("Elastic net", "Random forest")) %>% 
    gather(Perf, value, -Classifier)
perf_flippers$x <- rep(0.2, nrow(perf_flippers))
perf_flippers$y <- rep(c(-25, -27, -29, -31, -40), each = 2)
perf_flippers$Subject <- "ID"
perf_flippers$Response <- "ER"
perf_flippers$Perf[perf_flippers$Perf == "sens"] <- "Sens"
perf_flippers$Perf[perf_flippers$Perf == "spec"] <- "Spec"
perf_flippers$Perf[perf_flippers$Perf == "npv"] <- "NPV"
perf_flippers$Perf[perf_flippers$Perf == "ppv"] <- "PPV"
perf_flippers$Perf[perf_flippers$Perf == "accuracy"] <- "Accuracy"
perf_flippers$label <- paste(perf_flippers$Perf, "=", paste0(perf_flippers$value))
perf_flippers$Group <- "Different Response"
perf_flippers$Allergen <- "Cat" ## made up

repeatedResponse <- data.frame(prob = c(as.numeric(fit_enet$probs), as.numeric(fit_rf$probs)),
           lar =  c(demo1_val_repeatDifferentResponse$LAR, demo1_val_repeatDifferentResponse$LAR), 
           Subject = c(demo1_val_repeatDifferentResponse$concealed_ID, demo1_val_repeatDifferentResponse$concealed_ID),
           Response = c(demo1_val_repeatDifferentResponse$calculated_Response, demo1_val_repeatDifferentResponse$calculated_Response),
           Classifier = rep(c("Elastic net", "Random forest"), each = nrow(demo1_val_repeatDifferentResponse)),
           Allergen = c(as.character(demo1_val_repeatDifferentResponse$Allergen_cleanLabel), as.character(demo1_val_repeatDifferentResponse$Allergen_cleanLabel)))
repeatedResponse$Group = "Different Response"

0.11.2 repeated DRs

0.11.3 different response

## differentiall expression of flippers
all(rownames(demo1_val_repeatDR) == rownames(pos_hk_demo1_val_repeatDRList$trinity))
## [1] TRUE
group <- factor(demo1_val_repeatDR$calculated_Response, c("ER", "DR"))
subject <- as.character(demo1_val_repeatDR$NAME)
unique(subject)
## [1] "AKA"    "FWC"    "GCC"    "JPH"    "LAG"    "LJJ"    "TLC"    "STS009"
## [9] "STS011"
table(subject); length(subject)
## subject
##    AKA    FWC    GCC    JPH    LAG    LJJ STS009 STS011    TLC 
##      2      2      2      3      2      3      2      2      2
## [1] 20
# Elastic net
fit_enet <- enet(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity, alpha = 0, 
                 lambda = pos_hk_elasticNet_lambda["trinity"], 
            family = "multinomial", X.test = pos_hk_demo1_val_repeatDRList$trinity, Y.test = factor(demo1_val_repeatDR$calculated_Response, levels(y.train$trinity)), 
            filter = "none", topranked = 10, keepVar = NULL, pop.prev = 0.6, cutoff = NULL)

Y.test = factor(demo1_val_repeatDR$calculated_Response, levels(y.train$trinity))
pred_enet <- rep(NA, length(fit_enet$probs[,,1][,"DR"]))
pred_enet[fit_enet$probs[,,1][,"DR"] >= 0.5] <- "DR"
pred_enet[fit_enet$probs[,,1][,"DR"] < 0.5] <- "ER"
enetPerf_DRs <- calcPerf(pred=factor(pred_enet, levels(Y.test)), truth=Y.test, prev = 0.6)

# random forest
fit_rf <- rforest(X = pos_hk_demo1_disList$trinity, Y = y.train$trinity,
            family = "multinomial", X.test = pos_hk_demo1_val_repeatDRList$trinity, Y.test = factor(demo1_val_repeatDR$calculated_Response, levels(y.train$trinity)), 
            filter = "none", topranked = 10)

Y.test = factor(demo1_val_repeatDR$calculated_Response, levels(y.train$trinity))
pred_rf <- rep(NA, length(fit_rf$probs))
pred_rf[fit_rf$probs >= 0.5] <- "DR"
pred_rf[fit_rf$probs < 0.5] <- "ER"
rfPerf_DRs <- calcPerf(pred=factor(pred_rf, levels(Y.test)), truth=Y.test, prev = 0.6)

perf_DRs <- round(rbind(enetPerf_DRs, rfPerf_DRs), 2) %>% as.data.frame %>% 
  mutate(Classifier = c("Elastic net", "Random forest")) %>% 
    gather(Perf, value, -Classifier)
perf_DRs$x <- rep(0.2, nrow(perf_flippers))
perf_DRs$y <- rep(c(-25, -27, -29, -31, -40), each = 2)
perf_DRs$Subject <- "ID"
perf_DRs$Response <- "ER"
perf_DRs$Perf[perf_DRs$Perf == "sens"] <- "Sens"
perf_DRs$Perf[perf_DRs$Perf == "spec"] <- "Spec"
perf_DRs$Perf[perf_DRs$Perf == "npv"] <- "NPV"
perf_DRs$Perf[perf_DRs$Perf == "ppv"] <- "PPV"
perf_DRs$Perf[perf_DRs$Perf == "accuracy"] <- "Accuracy"
perf_DRs$label <- paste(perf_DRs$Perf, "=", paste0(perf_DRs$value))
perf_DRs$Group <- "repeated DRs"
perf_DRs$Allergen <- "Cat" ## made up

repeatedDRs <- data.frame(prob = c(as.numeric(fit_enet$probs[,,1][,"DR"]), as.numeric(fit_rf$probs)),
           lar =  c(demo1_val_repeatDR$LAR, demo1_val_repeatDR$LAR), 
           Subject = c(demo1_val_repeatDR$concealed_ID, demo1_val_repeatDR$concealed_ID),
           Response = c(demo1_val_repeatDR$calculated_Response, demo1_val_repeatDR$calculated_Response),
           Classifier = rep(c("Elastic net", "Random forest"), each = nrow(demo1_val_repeatDR)),
           Allergen = c(as.character(demo1_val_repeatDR$Allergen_cleanLabel), as.character(demo1_val_repeatDR$Allergen_cleanLabel)))
repeatedDRs$Group <- "repeated DRs"

0.11.4 Application of the trinity panel to subjects with a different response and repeated DRs

pdf(rnae_5d_dir, width = 12, height = 8)
rbind(repeatedResponse, repeatedDRs) %>% 
  ggplot(aes(x = prob, y = lar, group = Subject, color = Allergen)) + 
  geom_line(color = "black") + geom_point(size = 5) + facet_grid(Group~Classifier) +
  customTheme(sizeStripFont = 25, xAngle = 0, hjust = 0.5, vjust = 0.5, 
              xSize = 15, ySize = 15, xAxisSize = 15, yAxisSize = 15) +
  scale_y_continuous(expression('Maximum percent drop in '~ FEV[1]~" (3-7h)")) + 
  xlab("Likelihood of being a dual responder (based on Trinity panel)") +
  geom_hline(yintercept = -15, linetype = "dashed") + geom_vline(xintercept = 0.5, col = "gray") +
  geom_rect(xmin = 0, xmax = 0.5, ymin = -15,ymax = 0, alpha = 0.005, 
            fill = ann_colors$Response[2], color = ann_colors$Response[2]) +
  geom_rect(xmin = 0.5, xmax = 1, ymin = -50,ymax = -15, alpha = 0.005, 
            fill = ann_colors$Response[1], color = ann_colors$Response[1]) + 
  theme(legend.text = element_text(size = 16)) +
  scale_x_continuous(breaks= c(0, 0.25, 0.5, 0.75, 1),
                      labels=c("0", "0.25", "0.5", "0.75", "1")) +
  geom_text(data = subset(rbind(perf_flippers, perf_DRs), Perf == "Accuracy"), 
            aes(x = x, y = y, label = label), col = "black", size = 6) +
  geom_text(x = 0.4, y = -5, label = "ERs", col = ann_colors$Response[2], size = 8) +
  geom_text(x = 0.6, y = -40, label = "DRs", col = ann_colors$Response[1], size = 8)
dev.off()
## png 
##   2